% Xiong et al. (2020), arXiv preprint 2002.04745.
% The arXiv identifier is stored in the eprint fields rather than in `journal`,
% so eprint-aware styles can format and link it themselves.
% NOTE(review): a peer-reviewed version of this paper appears to exist in
% conference proceedings (ICML 2020) -- confirm, and if so consider citing it
% as @inproceedings instead of the preprint.
@misc{xiong2020layer,
  author        = {Xiong, Ruibin and Yang, Yunchang and He, Di and Zheng, Kai and Zheng, Shuxin and Xing, Chen and Zhang, Huishuai and Lan, Yanyan and Wang, Liwei and Liu, Tie-Yan},
  title         = {On Layer Normalization in the {Transformer} Architecture},
  year          = {2020},
  eprint        = {2002.04745},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  url           = {https://arxiv.org/abs/2002.04745},
}